The goal of this assignment is to create a linear regression model of baseball team statistics to predict the number of wins for a given team.
Following example: http://www.sthda.com/english/articles/40-regression-analysis/168-multiple-linear-regression-in-r/#:~:text=Multiple%20linear%20regression%20is%20an,distinct%20predictor%20variables%20(x).&text=The%20%E2%80%9Cb%E2%80%9D%20values%20are%20called,weights%20(or%20beta%20coefficients).
Following approach here, too: https://machinelearningmastery.com/machine-learning-in-r-step-by-step/
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ggExtra)
# Load the training data from its CSV file
mb_train_init <- read.csv(file = "moneyball-training-data.csv")
# Number of rows and columns that were read
dim(mb_train_init)
## [1] 2276 17
# List the class of each column. vapply() is used instead of sapply() so the
# result is always a named character vector (one class string per column)
# rather than silently changing shape.
vapply(mb_train_init, class, character(1))
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## "integer" "integer" "integer" "integer"
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## "integer" "integer" "integer" "integer"
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## "integer" "integer" "integer" "integer"
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## "integer" "integer" "integer" "integer"
## TEAM_FIELDING_DP
## "integer"
# Preview the first six rows (the default row count for head())
head(x = mb_train_init, n = 6L)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
The first six rows are output for simple visual inspection. Initial observation: TEAM_BATTING_HBP is NA (not available) in every row shown. This will need further investigation.
# Summary statistics for every column, including NA counts
summary(object = mb_train_init)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
Display summary of all the variables (statistics)
# Pairs plot of the five batting columns: hits, doubles, triples, home runs
# and walks. Columns are selected by name rather than by position so the
# plot does not silently change if the column order does.
# No `color` argument is passed: ggpairs() ignores such extra arguments and
# only warns about them; aesthetics must go through `mapping` instead.
ggpairs(
  mb_train_init[, c(
    "TEAM_BATTING_H", "TEAM_BATTING_2B", "TEAM_BATTING_3B",
    "TEAM_BATTING_HR", "TEAM_BATTING_BB"
  )]
)
Not much value in above visualization
# Wins against total hits, with a fitted linear trend line
p1 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_H, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Total Hits", x = "Batting Hits", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p1, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p1, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against doubles, with a fitted linear trend line
p2 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_2B, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Doubles", x = "Batting Doubles", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p2, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p2, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against triples, with a fitted linear trend line
p3 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_3B, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Triples", x = "Batting Triples", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p3, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p3, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against home runs hit, with a fitted linear trend line
p4 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_HR, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Home Runs", x = "Batting Home Runs", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p4, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p4, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against walks drawn, with a fitted linear trend line
p5 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_BB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Walks", x = "Batting Walks", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p5, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p5, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against batting strikeouts, with a fitted linear trend line.
# TEAM_BATTING_SO has missing values, so ggplot2 drops those rows and warns.
p6 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_SO, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Strikeouts", x = "Batting Strikeouts", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p6, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p6, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
# Wins against batters hit by pitch, with a fitted linear trend line.
# TEAM_BATTING_HBP is missing for most rows, so ggplot2 drops them and warns.
p7 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_HBP, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by HBP", x = "Batting HBP", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p7, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p7, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
# Wins against stolen bases, with a fitted linear trend line.
# TEAM_BASERUN_SB has missing values, so ggplot2 drops those rows and warns.
p8 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BASERUN_SB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Stolen Bases",
    x = "Base running Stolen Bases",
    y = "Wins"
  )
# Show the scatterplot with density curves in the margins
ggMarginal(p = p8, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p8, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
# Wins against times caught stealing, with a fitted linear trend line.
# TEAM_BASERUN_CS has missing values, so ggplot2 drops those rows and warns.
p9 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BASERUN_CS, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Caught Stealing",
    x = "Base running Caught Stealing",
    y = "Wins"
  )
# Show the scatterplot with density curves in the margins
ggMarginal(p = p9, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p9, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
# Wins against hits allowed, with a fitted linear trend line
p10 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_H, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Hits Allowed", x = "Pitching Hits Allowed", y = "Wins")
# Show the scatterplot with density curves in the margins
ggMarginal(p = p10, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p10, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against home runs allowed, with a fitted linear trend line
p11 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_HR, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Home Runs Allowed",
    x = "Pitching Home Runs Allowed",
    y = "Wins"
  )
# Show the scatterplot with density curves in the margins
ggMarginal(p = p11, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p11, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against walks allowed, with a fitted linear trend line
p12 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Walks Allowed",
    x = "Pitching Walks Allowed",
    y = "Wins"
  )
# Show the scatterplot with density curves in the margins
ggMarginal(p = p12, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p12, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against strikeouts recorded by the team's pitchers, with a fitted
# linear trend line.
# TEAM_PITCHING_SO has missing values, so ggplot2 drops those rows and warns.
# The title names pitching explicitly so this plot cannot be confused with
# the batting-strikeouts plot, which is titled "Wins by Strikeouts".
p13 <- ggplot(mb_train_init, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    x = "Pitching Strikeouts",
    y = "Wins",
    title = "Wins by Pitching Strikeouts"
  )
# Scatterplot with density plot
ggMarginal(p13, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p13, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
# Wins against fielding errors, with a fitted linear trend line.
# The title spelling is corrected from "Commited" to "Committed".
p14 <- ggplot(mb_train_init, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(x = "Fielding Errors", y = "Wins", title = "Wins by Errors Committed")
# Scatterplot with density plot
ggMarginal(p14, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p14, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Wins against defensive double plays, with a fitted linear trend line.
# TEAM_FIELDING_DP has missing values, so ggplot2 drops those rows and warns.
p15 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_FIELDING_DP, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Defensive Double Plays",
    x = "Fielding Double Plays",
    y = "Wins"
  )
# Show the scatterplot with density curves in the margins
ggMarginal(p = p15, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
# Same scatterplot, this time with boxplots in the margins
ggMarginal(p = p15, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
# Collapse the separate hit types into one variable: total bases.
# TEAM_BATTING_H counts all hit types combined, so singles are what remains
# after doubles, triples and home runs are subtracted.
mb_train_init$TEAM_BATTING_1B <- with(
  mb_train_init,
  TEAM_BATTING_H - TEAM_BATTING_2B - TEAM_BATTING_3B - TEAM_BATTING_HR
)
# Weight each hit type by the number of bases it is worth (1, 2, 3 and 4)
mb_train_init$TOTAL_BASES <- with(
  mb_train_init,
  TEAM_BATTING_1B + 2 * TEAM_BATTING_2B + 3 * TEAM_BATTING_3B +
    4 * TEAM_BATTING_HR
)
head(mb_train_init)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
## TEAM_BATTING_1B TOTAL_BASES
## 1 1199 1756
## 2 908 2172
## 3 973 2090
## 4 1044 1960
## 5 982 1843
## 6 951 1827
# Wins against total bases, with a fitted linear trend line
ggplot(
  data = mb_train_init,
  mapping = aes(x = TOTAL_BASES, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
# Pull out the rows that report zero wins
zero_wins <- mb_train_init[which(mb_train_init$TARGET_WINS == 0), ]
head(zero_wins)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1211 1347 0 891 135 0
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1211 0 0 0 0
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1211 0 NA 24057 0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1211 0 0 1890 NA
## TEAM_BATTING_1B TOTAL_BASES
## 1211 756 1026
# Only one row reports zero wins, and its values look bogus
# Isolate the rows with really bad pitching: more than 2000 hits allowed
bad_pitching <- mb_train_init[which(mb_train_init$TEAM_PITCHING_H > 2000), ]
summary(bad_pitching)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 462 1st Qu.: 71.00 1st Qu.:1450 1st Qu.:206.0
## Median :1188 Median : 87.00 Median :1576 Median :237.0
## Mean :1236 Mean : 82.08 Mean :1609 Mean :242.7
## 3rd Qu.:2035 3rd Qu.:100.00 3rd Qu.:1712 3rd Qu.:280.0
## Max. :2535 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 51.00 1st Qu.: 15.00 1st Qu.:174.0 1st Qu.: 319.0
## Median : 82.00 Median : 33.00 Median :276.0 Median : 456.0
## Mean : 81.26 Mean : 50.32 Mean :341.4 Mean : 497.6
## 3rd Qu.:107.00 3rd Qu.: 62.00 3rd Qu.:523.0 3rd Qu.: 685.0
## Max. :223.00 Max. :239.00 Max. :819.0 Max. :1264.0
##
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.00 Min. : NA Min. : 2003
## 1st Qu.:107.2 1st Qu.: 43.00 1st Qu.: NA 1st Qu.: 2130
## Median :198.5 Median : 54.00 Median : NA Median : 2412
## Mean :221.2 Mean : 53.03 Mean :NaN Mean : 3810
## 3rd Qu.:318.0 3rd Qu.: 67.00 3rd Qu.: NA 3rd Qu.: 3861
## Max. :697.0 Max. :118.00 Max. : NA Max. :30132
## NA's :123 NA's :192 NA's :257
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 82.0
## 1st Qu.: 33.0 1st Qu.: 380.0 1st Qu.: 475.0 1st Qu.: 443.0
## Median : 55.0 Median : 553.0 Median : 845.0 Median : 660.0
## Mean : 81.4 Mean : 613.6 Mean : 986.2 Mean : 674.9
## 3rd Qu.: 97.0 3rd Qu.: 752.0 3rd Qu.: 1156.0 3rd Qu.: 894.0
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
##
## TEAM_FIELDING_DP TEAM_BATTING_1B TOTAL_BASES
## Min. : 52.0 Min. : 709 Min. :1026
## 1st Qu.: 97.0 1st Qu.:1068 1st Qu.:1934
## Median :127.0 Median :1208 Median :2159
## Mean :124.5 Mean :1234 Mean :2165
## 3rd Qu.:149.0 3rd Qu.:1356 3rd Qu.:2376
## Max. :201.0 Max. :2112 Max. :3290
## NA's :156
# Row and column counts of the bad-pitching subset
c(nrow(bad_pitching), ncol(bad_pitching))
## [1] 257 19
# 257 rows exceed the hits-allowed threshold, which seems like a lot
# The most wins in a season is 116, so pull out every row above that number
too_many_wins <- mb_train_init[which(mb_train_init$TARGET_WINS > 116), ]
summary(too_many_wins)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 323 Min. :117.0 Min. :1403 Min. :193.0
## 1st Qu.: 462 1st Qu.:118.0 1st Qu.:1561 1st Qu.:221.0
## Median : 492 Median :122.0 Median :1689 Median :280.0
## Mean :1112 Mean :124.5 Mean :1867 Mean :286.4
## 3rd Qu.:2034 3rd Qu.:128.0 3rd Qu.:2273 3rd Qu.:322.0
## Max. :2250 Max. :146.0 Max. :2554 Max. :393.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 41.0 Min. : 8.00 Min. : 64.0 Min. : 0.0
## 1st Qu.: 76.0 1st Qu.: 22.00 1st Qu.:170.0 1st Qu.: 78.0
## Median :108.0 Median : 29.00 Median :266.0 Median :419.0
## Mean :100.5 Mean : 41.88 Mean :313.6 Mean :349.2
## 3rd Qu.:119.0 3rd Qu.: 46.00 3rd Qu.:477.0 3rd Qu.:645.5
## Max. :156.0 Max. :164.00 Max. :670.0 Max. :777.0
## NA's :2
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 32.0 Min. :35.00 Min. : NA Min. : 1495
## 1st Qu.:148.2 1st Qu.:41.50 1st Qu.: NA 1st Qu.: 2066
## Median :228.0 Median :48.00 Median : NA Median : 2570
## Mean :207.6 Mean :52.33 Mean :NaN Mean : 4112
## 3rd Qu.:284.0 3rd Qu.:61.00 3rd Qu.: NA 3rd Qu.: 5253
## Max. :324.0 Max. :74.00 Max. : NA Max. :13724
## NA's :9 NA's :14 NA's :17
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 15.00 Min. : 131.0 Min. : 0.0 Min. : 135.0
## 1st Qu.: 33.00 1st Qu.: 371.0 1st Qu.: 225.5 1st Qu.: 479.0
## Median : 59.00 Median : 467.0 Median : 652.0 Median : 692.0
## Mean : 79.12 Mean : 521.9 Mean : 547.0 Mean : 678.6
## 3rd Qu.: 97.00 3rd Qu.: 628.0 3rd Qu.: 841.5 3rd Qu.: 928.0
## Max. :301.00 Max. :1539.0 Max. :1114.0 Max. :1192.0
## NA's :2
## TEAM_FIELDING_DP TEAM_BATTING_1B TOTAL_BASES
## Min. : 79.0 Min. :1036 Min. :1811
## 1st Qu.: 86.0 1st Qu.:1115 1st Qu.:2148
## Median :104.0 Median :1291 Median :2292
## Mean :105.2 Mean :1438 Mean :2480
## 3rd Qu.:107.0 3rd Qu.:1795 3rd Qu.:2936
## Max. :156.0 Max. :2016 Max. :3290
## NA's :11
# Row and column counts of the too-many-wins subset
c(nrow(too_many_wins), ncol(too_many_wins))
## [1] 17 19
# 17 rows report more than 116 wins
# Baseline model with every predictor except TEAM_BATTING_HBP; including it
# would cause over 2000 observations to be removed for missing values
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_init
)
summary(object = model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_init)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.5627 -6.6932 -0.1328 6.5249 27.8525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.912438 6.642839 8.718 < 2e-16 ***
## TEAM_BATTING_1B 0.015434 0.019626 0.786 0.4318
## TEAM_BATTING_2B -0.055039 0.020556 -2.677 0.0075 **
## TEAM_BATTING_3B 0.176985 0.028193 6.278 4.52e-10 ***
## TEAM_BATTING_HR 0.089385 0.090409 0.989 0.3230
## TEAM_BATTING_BB 0.043765 0.046454 0.942 0.3463
## TEAM_BATTING_SO 0.018250 0.023463 0.778 0.4368
## TEAM_BASERUN_SB 0.035880 0.008687 4.130 3.83e-05 ***
## TEAM_BASERUN_CS 0.052124 0.018227 2.860 0.0043 **
## TEAM_PITCHING_H 0.019044 0.018381 1.036 0.3003
## TEAM_PITCHING_HR 0.022997 0.082092 0.280 0.7794
## TEAM_PITCHING_BB -0.004180 0.044692 -0.094 0.9255
## TEAM_PITCHING_SO -0.038176 0.022447 -1.701 0.0892 .
## TEAM_FIELDING_E -0.155876 0.009946 -15.672 < 2e-16 ***
## TEAM_FIELDING_DP -0.112885 0.013137 -8.593 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.556 on 1471 degrees of freedom
## (790 observations deleted due to missingness)
## Multiple R-squared: 0.4386, Adjusted R-squared: 0.4333
## F-statistic: 82.1 on 14 and 1471 DF, p-value: < 2.2e-16
# Error rate: residual standard error relative to mean wins.
# The mean is taken over the rows lm() actually fitted (rows with missing
# predictors were deleted), not over every row of mb_train_init.
sigma(model) / mean(model.frame(model)$TARGET_WINS)
## [1] 0.1182788
# Reduced model: keep only the variables that were significant above
model_sig <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_init
)
summary(object = model_sig)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_init)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.687 -7.955 -0.154 8.008 37.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.152078 3.534132 26.075 < 2e-16 ***
## TEAM_BATTING_2B 0.024128 0.007505 3.215 0.00133 **
## TEAM_BATTING_3B 0.269509 0.021720 12.408 < 2e-16 ***
## TEAM_BASERUN_SB 0.019670 0.010031 1.961 0.05008 .
## TEAM_BASERUN_CS 0.002149 0.021423 0.100 0.92012
## TEAM_FIELDING_E -0.162860 0.011250 -14.477 < 2e-16 ***
## TEAM_FIELDING_DP -0.048528 0.015288 -3.174 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.47 on 1479 degrees of freedom
## (790 observations deleted due to missingness)
## Multiple R-squared: 0.1875, Adjusted R-squared: 0.1842
## F-statistic: 56.89 on 6 and 1479 DF, p-value: < 2.2e-16
# Error rate: residual standard error relative to mean wins.
# The mean is taken over the rows lm() actually fitted (rows with missing
# predictors were deleted), not over every row of mb_train_init.
sigma(model_sig) / mean(model.frame(model_sig)$TARGET_WINS)
## [1] 0.1419097
# Variant model: the four hit-type columns are replaced by TOTAL_BASES
model_tb <- lm(
  formula = TARGET_WINS ~ TOTAL_BASES + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_init
)
summary(object = model_tb)
##
## Call:
## lm(formula = TARGET_WINS ~ TOTAL_BASES + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_init)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.988 -6.708 0.014 6.523 29.819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.338873 6.504527 10.814 < 2e-16 ***
## TOTAL_BASES -0.004803 0.006700 -0.717 0.473567
## TEAM_BATTING_BB 0.068653 0.035606 1.928 0.054032 .
## TEAM_BATTING_SO 0.019373 0.020869 0.928 0.353390
## TEAM_BASERUN_SB 0.032802 0.009004 3.643 0.000279 ***
## TEAM_BASERUN_CS 0.070570 0.018849 3.744 0.000188 ***
## TEAM_PITCHING_H 0.027246 0.009711 2.806 0.005086 **
## TEAM_PITCHING_HR 0.100054 0.019899 5.028 5.56e-07 ***
## TEAM_PITCHING_BB -0.031004 0.034030 -0.911 0.362401
## TEAM_PITCHING_SO -0.047653 0.019806 -2.406 0.016252 *
## TEAM_FIELDING_E -0.125710 0.009863 -12.745 < 2e-16 ***
## TEAM_FIELDING_DP -0.109698 0.013625 -8.051 1.68e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.927 on 1474 degrees of freedom
## (790 observations deleted due to missingness)
## Multiple R-squared: 0.393, Adjusted R-squared: 0.3884
## F-statistic: 86.74 on 11 and 1474 DF, p-value: < 2.2e-16
# Did not prove helpful, but leave for now
# Error rate: residual standard error relative to mean wins.
# The mean is taken over the rows lm() actually fitted (rows with missing
# predictors were deleted), not over every row of mb_train_init.
sigma(model_tb) / mean(model.frame(model_tb)$TARGET_WINS)
## [1] 0.1228704
The lower the RSE, the more accurate the model
Overall, the F-statistic p-value is < 2.2e-16, which is highly significant.
All of the potential predictor variables were added to the model. A variable is treated as significant when its t-value is significantly different from zero. The coefficient rows examined:
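| Variable | Estimate | Std. Error | t value | Pr(>\|t\|) | Signif. |
|---|---|---|---|---|---|
| TEAM_FIELDING_E | -0.17204 | 0.04140 | -4.155 | 5.08e-05 | `***` |
| TEAM_FIELDING_DP | -0.10819 | 0.03654 | -2.961 | 0.00349 | `**` |
| TEAM_BATTING_3B | -0.10118 | 0.07751 | -1.305 | 0.19348 | |
| TEAM_BATTING_BB | -4.45969 | 3.63624 | -1.226 | 0.22167 | |
| TEAM_BASERUN_SB | 0.03304 | 0.02867 | 1.152 | 0.25071 | |
| TEAM_BATTING_HBP | 0.08247 | 0.04960 | 1.663 | 0.09815 | `.` |
| TEAM_PITCHING_BB | 4.51089 | 3.63372 | 1.241 | 0.21612 | |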
# Build the cleaned data set used for the remaining models.
# Drop the Batting HBP column: it is missing for 2085 of the 2276 rows
mb_train_init_clean <- subset(mb_train_init, select = -TEAM_BATTING_HBP)
# Keep only plausible rows:
#   * no more than 116 wins
#   * not zero wins (1 row in total)
#   * not too many hits allowed. "Too many" means more than 2000, the same
#     cutoff used for bad_pitching, so a row at exactly 2000 is kept.
mb_train_init_clean <- subset(
  mb_train_init_clean,
  TARGET_WINS <= 116 & TARGET_WINS != 0 & TEAM_PITCHING_H <= 2000
)
summary(mb_train_init_clean)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 2.0 Min. : 21.00 Min. :1137 Min. :118.0
## 1st Qu.: 648.5 1st Qu.: 71.00 1st Qu.:1377 1st Qu.:208.0
## Median :1279.0 Median : 82.00 Median :1446 Median :239.0
## Mean :1272.6 Mean : 80.55 Mean :1451 Mean :241.1
## 3rd Qu.:1902.5 3rd Qu.: 91.00 3rd Qu.:1523 3rd Qu.:272.0
## Max. :2534.0 Max. :116.00 Max. :1876 Max. :392.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 11.00 Min. : 3.0 Min. :171.0 Min. : 268.0
## 1st Qu.: 33.00 1st Qu.: 54.0 1st Qu.:461.5 1st Qu.: 579.0
## Median : 45.00 Median :109.0 Median :519.0 Median : 791.0
## Mean : 51.88 Mean :106.0 Mean :521.9 Mean : 767.7
## 3rd Qu.: 66.00 3rd Qu.:149.5 3rd Qu.:583.0 3rd Qu.: 943.0
## Max. :147.00 Max. :264.0 Max. :878.0 Max. :1399.0
## NA's :100
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 18.0 Min. : 11.00 Min. :1137 Min. : 3.0
## 1st Qu.: 65.0 1st Qu.: 38.00 1st Qu.:1408 1st Qu.: 58.0
## Median : 99.0 Median : 49.00 Median :1494 Median :112.0
## Mean :118.2 Mean : 52.81 Mean :1521 Mean :108.9
## 3rd Qu.:149.0 3rd Qu.: 62.00 3rd Qu.:1608 3rd Qu.:152.0
## Max. :654.0 Max. :201.00 Max. :1999 Max. :264.0
## NA's :8 NA's :577
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. :247.0 Min. : 301.0 Min. : 65.0 Min. : 68.0
## 1st Qu.:482.0 1st Qu.: 626.0 1st Qu.:125.0 1st Qu.:133.0
## Median :536.0 Median : 811.0 Median :153.0 Median :149.0
## Mean :545.3 Mean : 795.3 Mean :191.9 Mean :147.6
## 3rd Qu.:601.5 3rd Qu.: 953.5 3rd Qu.:212.0 3rd Qu.:164.0
## Max. :929.0 Max. :1659.0 Max. :796.0 Max. :228.0
## NA's :100 NA's :130
## TEAM_BATTING_1B TOTAL_BASES
## Min. : 811 Min. :1453
## 1st Qu.: 985 1st Qu.:1950
## Median :1042 Median :2123
## Mean :1052 Mean :2114
## 3rd Qu.:1109 3rd Qu.:2276
## Max. :1458 Max. :2832
##
dim(mb_train_init_clean)
## [1] 2015 18
# https://machinelearningmastery.com/machine-learning-in-r-step-by-step/
# Fix the RNG seed so the random 80/20 split, and every model fitted on it,
# is identical on each run. The results printed in this document come from an
# earlier, unseeded run and will not match exactly.
set.seed(123)
# Row indices of a random 80% sample of the cleaned data, drawn by
# createDataPartition() on TARGET_WINS. Despite its name, validation_index
# holds the rows used for TRAINING.
validation_index <- createDataPartition(
  mb_train_init_clean$TARGET_WINS,
  p = 0.80,
  list = FALSE
)
# The 20% of rows that were not sampled are held out for validation
mb_valid_clean <- mb_train_init_clean[-validation_index, ]
# The sampled 80% of rows are used to fit the models
mb_train_clean <- mb_train_init_clean[validation_index, ]
# Baseline model: all 14 candidate predictors. Rows with a missing value in
# any model variable are dropped (na.omit).
model_clean <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_clean, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.8782 -6.5006 -0.0018 6.4110 28.9285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.201065 7.641781 7.616 5.54e-14 ***
## TEAM_BATTING_1B 0.048577 0.048556 1.000 0.317311
## TEAM_BATTING_2B -0.008746 0.049586 -0.176 0.860020
## TEAM_BATTING_3B 0.203229 0.056923 3.570 0.000372 ***
## TEAM_BATTING_HR -0.217925 0.241232 -0.903 0.366517
## TEAM_BATTING_BB 0.100544 0.111822 0.899 0.368771
## TEAM_BATTING_SO -0.015440 0.053202 -0.290 0.771701
## TEAM_BASERUN_SB 0.027806 0.010283 2.704 0.006952 **
## TEAM_BASERUN_CS 0.066262 0.021038 3.150 0.001678 **
## TEAM_PITCHING_H -0.016064 0.046810 -0.343 0.731525
## TEAM_PITCHING_HR 0.352839 0.221205 1.595 0.110977
## TEAM_PITCHING_BB -0.059293 0.108798 -0.545 0.585878
## TEAM_PITCHING_SO -0.006510 0.051740 -0.126 0.899893
## TEAM_FIELDING_E -0.146791 0.011822 -12.416 < 2e-16 ***
## TEAM_FIELDING_DP -0.105609 0.015068 -7.009 4.14e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.594 on 1120 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.4191, Adjusted R-squared: 0.4118
## F-statistic: 57.71 on 14 and 1120 DF, p-value: < 2.2e-16
# Residual standard error of the baseline model
sig <- sigma(model_clean)
sig
## [1] 9.593717
# Error rate: residual standard error relative to the mean number of wins.
# The mean is taken over every row of mb_train_clean, including rows that
# lm() dropped because of missing values.
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190831
# STEP 1
# Same predictors as model_clean, minus TEAM_BATTING_SO
model_clean_st1 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.8526 -6.5244 0.0445 6.4400 28.9837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.306977 7.629944 7.642 4.58e-14 ***
## TEAM_BATTING_1B 0.040793 0.040458 1.008 0.31354
## TEAM_BATTING_2B -0.016786 0.041109 -0.408 0.68310
## TEAM_BATTING_3B 0.194184 0.047613 4.078 4.86e-05 ***
## TEAM_BATTING_HR -0.261124 0.189758 -1.376 0.16907
## TEAM_BATTING_BB 0.103590 0.111283 0.931 0.35212
## TEAM_BASERUN_SB 0.027902 0.010273 2.716 0.00671 **
## TEAM_BASERUN_CS 0.066802 0.020947 3.189 0.00147 **
## TEAM_PITCHING_H -0.008487 0.038836 -0.219 0.82705
## TEAM_PITCHING_HR 0.387751 0.185562 2.090 0.03688 *
## TEAM_PITCHING_BB -0.062413 0.108221 -0.577 0.56425
## TEAM_PITCHING_SO -0.021504 0.002810 -7.652 4.24e-14 ***
## TEAM_FIELDING_E -0.147181 0.011741 -12.536 < 2e-16 ***
## TEAM_FIELDING_DP -0.105623 0.015062 -7.013 4.04e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.59 on 1121 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.419, Adjusted R-squared: 0.4123
## F-statistic: 62.19 on 13 and 1121 DF, p-value: < 2.2e-16
# Residual standard error of the Step 1 model
sig <- sigma(model_clean_st1)
sig
## [1] 9.589798
# Error rate relative to the mean wins of all rows in mb_train_clean
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190345
# STEP 2
# Same predictors as Step 1, minus TEAM_PITCHING_H
model_clean_st2 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.8110 -6.5213 0.0232 6.4470 29.0552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.445959 7.600165 7.690 3.20e-14 ***
## TEAM_BATTING_1B 0.032029 0.005341 5.997 2.70e-09 ***
## TEAM_BATTING_2B -0.025587 0.008253 -3.100 0.00198 **
## TEAM_BATTING_3B 0.185187 0.023914 7.744 2.14e-14 ***
## TEAM_BATTING_HR -0.264818 0.188923 -1.402 0.16128
## TEAM_BATTING_BB 0.125298 0.050147 2.499 0.01261 *
## TEAM_BASERUN_SB 0.028118 0.010221 2.751 0.00604 **
## TEAM_BASERUN_CS 0.066391 0.020853 3.184 0.00149 **
## TEAM_PITCHING_HR 0.382881 0.184140 2.079 0.03782 *
## TEAM_PITCHING_BB -0.083558 0.048453 -1.725 0.08489 .
## TEAM_PITCHING_SO -0.021550 0.002801 -7.693 3.14e-14 ***
## TEAM_FIELDING_E -0.147510 0.011639 -12.674 < 2e-16 ***
## TEAM_FIELDING_DP -0.105432 0.015030 -7.015 3.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.586 on 1122 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.419, Adjusted R-squared: 0.4128
## F-statistic: 67.43 on 12 and 1122 DF, p-value: < 2.2e-16
# Residual standard error of the Step 2 model
sig <- sigma(model_clean_st2)
sig
## [1] 9.585728
# Error rate relative to the mean wins of all rows in mb_train_clean
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.118984
# Information criteria for the Step 2 model
AIC(object = model_clean_st2)
BIC(object = model_clean_st2)
# STEP 3
# Same predictors as Step 2, minus TEAM_BATTING_HR
model_clean_st3 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st3)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.7825 -6.6518 -0.1376 6.3787 28.0915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.295477 7.602670 7.668 3.78e-14 ***
## TEAM_BATTING_1B 0.032529 0.005331 6.102 1.44e-09 ***
## TEAM_BATTING_2B -0.028519 0.007987 -3.571 0.000371 ***
## TEAM_BATTING_3B 0.185136 0.023924 7.738 2.23e-14 ***
## TEAM_BATTING_BB 0.058641 0.015925 3.682 0.000242 ***
## TEAM_BASERUN_SB 0.029877 0.010148 2.944 0.003306 **
## TEAM_BASERUN_CS 0.063748 0.020777 3.068 0.002205 **
## TEAM_PITCHING_HR 0.125084 0.009129 13.703 < 2e-16 ***
## TEAM_PITCHING_BB -0.018832 0.014685 -1.282 0.199963
## TEAM_PITCHING_SO -0.021018 0.002777 -7.570 7.78e-14 ***
## TEAM_FIELDING_E -0.149394 0.011566 -12.917 < 2e-16 ***
## TEAM_FIELDING_DP -0.104712 0.015028 -6.968 5.47e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.59 on 1123 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.418, Adjusted R-squared: 0.4123
## F-statistic: 73.32 on 11 and 1123 DF, p-value: < 2.2e-16
# Residual standard error of the Step 3 model
sig <- sigma(model_clean_st3)
sig
## [1] 9.589845
# Error rate relative to the mean wins of all rows in mb_train_clean
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190351
Across repeated runs, the Step 3 model consistently gives the best result.
# STEP 4
# Same predictors as Step 3, minus TEAM_PITCHING_BB
model_clean_st4 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st4)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_clean, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.8522 -6.5628 -0.1001 6.4270 27.9040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.935980 7.496424 7.995 3.19e-15 ***
## TEAM_BATTING_1B 0.032178 0.005325 6.042 2.06e-09 ***
## TEAM_BATTING_2B -0.028138 0.007983 -3.525 0.000441 ***
## TEAM_BATTING_3B 0.178552 0.023374 7.639 4.67e-14 ***
## TEAM_BATTING_BB 0.038822 0.003845 10.097 < 2e-16 ***
## TEAM_BASERUN_SB 0.031367 0.010084 3.110 0.001915 **
## TEAM_BASERUN_CS 0.062814 0.020770 3.024 0.002549 **
## TEAM_PITCHING_HR 0.124755 0.009128 13.668 < 2e-16 ***
## TEAM_PITCHING_SO -0.021315 0.002768 -7.701 2.95e-14 ***
## TEAM_FIELDING_E -0.151741 0.011424 -13.283 < 2e-16 ***
## TEAM_FIELDING_DP -0.106320 0.014980 -7.098 2.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.593 on 1124 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.4171, Adjusted R-squared: 0.4119
## F-statistic: 80.44 on 10 and 1124 DF, p-value: < 2.2e-16
# Residual standard error of the Step 4 model
sig <- sigma(model_clean_st4)
sig
## [1] 9.592594
# Error rate relative to the mean wins of all rows in mb_train_clean
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190692
# Base-R diagnostic plots for the Step 4 model
plot(model_clean_st4)
# Graph residuals
# Histogram of the Step 3 model's residuals. The residuals are placed in
# their own data frame and mapped by column name: ggplot2 advises against
# using `$` inside aes(), and passing the lm object itself as `data` relies
# on an implicit fortify() conversion.
ggplot(
  data = data.frame(residual = residuals(model_clean_st3)),
  mapping = aes(x = residual)
) +
  geom_histogram(binwidth = 1, color = "black", fill = "purple4") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  ) +
  xlab("Residual") +
  ggtitle("Histogram for Model Residuals")
# Automated stepwise selection with MASS::stepAIC, starting from the full
# baseline model and searching in both directions
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
stepmodel <- stepAIC(object = model_clean, direction = "both", trace = FALSE)
summary(stepmodel)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_3B +
## TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train_clean, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.8918 -6.6459 -0.1023 6.5008 28.1725
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.313645 7.578559 7.695 3.09e-14 ***
## TEAM_BATTING_1B 0.056888 0.007474 7.612 5.72e-14 ***
## TEAM_BATTING_3B 0.211683 0.025403 8.333 2.28e-16 ***
## TEAM_BATTING_BB 0.039484 0.003847 10.263 < 2e-16 ***
## TEAM_BATTING_SO -0.022025 0.002811 -7.836 1.07e-14 ***
## TEAM_BASERUN_SB 0.028665 0.010062 2.849 0.00447 **
## TEAM_BASERUN_CS 0.064731 0.020712 3.125 0.00182 **
## TEAM_PITCHING_H -0.024379 0.004864 -5.012 6.25e-07 ***
## TEAM_PITCHING_HR 0.148530 0.011372 13.061 < 2e-16 ***
## TEAM_FIELDING_E -0.147211 0.011390 -12.924 < 2e-16 ***
## TEAM_FIELDING_DP -0.104964 0.014991 -7.002 4.35e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.582 on 1124 degrees of freedom
## (479 observations deleted due to missingness)
## Multiple R-squared: 0.4184, Adjusted R-squared: 0.4132
## F-statistic: 80.87 on 10 and 1124 DF, p-value: < 2.2e-16
# Residual standard error of the stepAIC-selected model
sig <- sigma(stepmodel)
sig
## [1] 9.582
# Error rate relative to the mean wins of all rows in mb_train_clean
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1189377
# Base-R diagnostic plots for the selected model
plot(x = stepmodel)
https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
Using package MICE
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
# SKIP THIS SECTION
# Multiply impute the missing values of the full training data:
# predictive mean matching, 5 imputed data sets, 50 iterations, fixed seed
mb_train_init_imputed <- mice(
  data = mb_train_init,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500
)
summary(mb_train_init_imputed)
# Work with the second completed data set
mb_train_imp_2 <- complete(mb_train_init_imputed, 2)
# Row indices of a random 80% sample of the completed data, used for training.
# NOTE: this overwrites the validation_index created for the cleaned data.
validation_index <- createDataPartition(
  mb_train_imp_2$TARGET_WINS,
  p = 0.80,
  list = FALSE
)
# Held-out 20% for validation
mb_valid_imp <- mb_train_imp_2[-validation_index, ]
# Sampled 80% for training and testing the models
mb_train_imp <- mb_train_imp_2[validation_index, ]
# Show the same completed data set next to the original data for comparison
complete_data_2 <- complete(mb_train_init_imputed, 2)
head(complete_data_2)
head(mb_train_init)
# build predictive model
# Fit the same regression on each of the imputed data sets stored in
# mb_train_init_imputed. lm() must NOT receive its own `data` argument here:
# with() evaluates the expression inside each completed data set, and an
# explicit `data = mb_train_clean` would make every analysis use that single
# non-imputed data frame, so pooling would combine identical fits.
# The argument is spelled `expr` in full instead of relying on partial
# matching of `exp`.
# NOTE(review): the imputed object covers every row of mb_train_init,
# including rows later placed in mb_valid_imp -- confirm that fitting on all
# rows is intended.
fit <- with(
  data = mb_train_init_imputed,
  expr = lm(
    TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
      TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
      TEAM_BASERUN_CS + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
      TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
    na.action = na.omit
  )
)
summary(fit)
# combine results of all 5 models
pooled <- pool(fit)
summary(pooled)
# https://stackoverflow.com/questions/52713733/how-to-use-predict-function-with-my-pooled-results-from-mice
# Start from the lm that was fitted to the first imputed data set ...
pooled_lm <- fit$analyses[[1]]
# ... and overwrite its coefficients with the pooled estimates, so that
# predict() uses the pooled model. The replacement is positional: the order
# of the pooled estimates still needs to be checked against the lm's
# coefficient order.
pooled_lm[["coefficients"]] <- summary(pooled)$estimate
# Predictions seem to match the pooled coefficients rather than the original
# lm that was copied
# predict(fit$analyses[[1]], newdata = nhanes)
wins_pred_imp <- predict(object = pooled_lm, newdata = mb_valid_imp)
wins_pred_imp
# Actual vs. predicted wins for the validation rows, side by side
actual_preds_imp <- data.frame(
  cbind(actuals = mb_valid_imp$TARGET_WINS, predicteds = wins_pred_imp)
)
# actual_preds_imp <- subset(actual_preds_imp, predicteds > 0)
actual_preds_imp
# Correlation between actual and predicted wins
correlation_accuracy <- cor(actual_preds_imp)
correlation_accuracy
# Mean absolute percentage error -- lower is better
# mape 0.1397487 after removal of negative wins
# 0.3172925 with the negative scores
mape <- MAPE(actual_preds_imp$predicteds, actual_preds_imp$actuals)
mape
# Load the evaluation data set from its CSV file
mb_eval <- read.csv(file = "moneyball-evaluation-data.csv")
# Number of rows and columns
dim(mb_eval)
## [1] 259 16
# Class of every column
sapply(X = mb_eval, FUN = class)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## "integer" "integer" "integer" "integer"
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## "integer" "integer" "integer" "integer"
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## "integer" "integer" "integer" "integer"
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## "integer" "integer" "integer" "integer"
# first six rows (head()'s default)
head(x = mb_eval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 9 1209 170 33 83
## 2 10 1221 151 29 88
## 3 14 1395 183 29 93
## 4 47 1539 309 29 159
## 5 60 1445 203 68 5
## 6 63 1431 236 53 10
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 447 1080 62 50
## 2 516 929 54 39
## 3 509 816 59 47
## 4 486 914 148 57
## 5 95 416 NA NA
## 6 215 377 NA NA
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 NA 1209 83 447
## 2 NA 1221 88 516
## 3 NA 1395 93 509
## 4 42 1539 159 486
## 5 NA 3902 14 257
## 6 NA 2793 20 420
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 1080 140 156
## 2 929 135 164
## 3 816 156 153
## 4 914 124 154
## 5 1123 616 130
## 6 736 572 105
# Per-column summary statistics of the evaluation data
summary(object = mb_eval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 9 Min. : 819 Min. : 44.0 Min. : 14.00
## 1st Qu.: 708 1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00
## Median :1249 Median :1455 Median :239.0 Median : 52.00
## Mean :1264 Mean :1469 Mean :241.3 Mean : 55.91
## 3rd Qu.:1832 3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00
## Max. :2525 Max. :2170 Max. :376.0 Max. :155.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0
## Median :101.00 Median :509.0 Median : 686.0 Median : 92.0
## Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7
## 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8
## Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0
## NA's :18 NA's :13
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0
## 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0
## Median : 49.50 Median :62.00 Median : 1515 Median :104.0
## Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1
## 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5
## Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0
## NA's :87 NA's :240
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
## 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
## Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
## Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
## 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
## Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
## NA's :18 NA's :31
# Impute the missing values in the validation split:
# predictive mean matching, 5 imputed data sets, 50 iterations, fixed seed
mb_valid_clean_imputed <- mice(
  data = mb_valid_clean,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500
)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 1 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 1 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 1 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 2 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 2 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 3 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 3 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 4 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 4 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 4 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 4 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 4 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 5 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 5 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 5 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 5 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 6 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 6 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 6 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 6 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 6 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 7 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 7 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 7 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 7 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 7 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 8 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 8 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 8 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 8 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 8 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 9 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 9 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 9 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 9 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 9 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 10 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 10 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 10 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 10 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 10 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 11 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 11 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 11 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 11 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 11 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 12 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 12 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 12 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 12 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 12 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 13 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 13 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 13 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 13 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 13 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 14 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 14 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 14 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 14 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 14 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 15 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 15 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 15 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 15 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 15 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 16 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 16 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 16 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 16 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 16 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 17 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 17 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 17 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 17 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 17 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 18 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 18 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 18 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 18 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 18 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 19 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 19 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 19 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 19 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 19 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 20 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 20 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 20 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 20 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 20 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 21 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 21 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 21 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 21 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 21 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 22 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 22 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 22 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 22 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 22 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 23 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 23 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 23 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 23 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 23 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 24 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 24 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 24 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 24 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 24 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 25 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 25 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 25 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 25 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 25 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 26 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 26 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 26 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 26 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 26 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 27 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 27 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 27 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 27 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 27 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 28 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 28 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 28 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 28 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 28 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 29 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 29 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 29 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 29 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 29 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 30 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 30 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 30 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 30 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 30 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 31 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 31 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 31 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 31 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 31 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 32 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 32 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 32 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 32 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 32 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 33 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 33 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 33 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 33 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 33 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 34 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 34 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 34 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 34 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 34 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 35 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 35 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 35 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 35 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 35 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 36 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 36 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 36 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 36 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 36 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 37 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 37 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 37 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 37 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 37 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 38 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 38 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 38 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 38 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 38 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 39 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 39 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 39 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 39 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 39 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 40 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 40 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 40 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 40 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 40 5 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 41 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 41 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 41 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 41 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 41 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 42 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 42 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 42 3 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 42 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 42 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 43 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 43 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 43 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 43 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 43 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 44 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 44 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 44 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 44 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 44 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 45 1 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 45 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 45 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 45 4 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 45 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 46 1 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 46 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 46 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 46 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 46 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 47 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 47 2 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 47 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 47 4 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 47 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 48 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 48 2 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 48 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 48 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 48 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 49 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP*
## 49 2 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 49 3 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 49 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 49 5 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## 50 1 TEAM_BATTING_SO* TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 50 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO TEAM_FIELDING_DP
## 50 3 TEAM_BATTING_SO* TEAM_BASERUN_SB TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 50 4 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP*
## 50 5 TEAM_BATTING_SO TEAM_BASERUN_SB* TEAM_BASERUN_CS* TEAM_PITCHING_SO* TEAM_FIELDING_DP
## * Please inspect the loggedEvents
## Warning: Number of logged events: 2008
# Print the summary of the imputed validation object (class, number of
# imputations, per-column imputation methods, predictor matrix, logged events).
summary(mb_valid_clean_imputed)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## "" "" "" ""
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## "" "" "" "pmm"
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR
## "pmm" "pmm" "" ""
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## "" "pmm" "" "pmm"
## TEAM_BATTING_1B TOTAL_BASES
## "" ""
## PredictorMatrix:
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX 0 1 1 1
## TARGET_WINS 1 0 1 1
## TEAM_BATTING_H 1 1 0 1
## TEAM_BATTING_2B 1 1 1 0
## TEAM_BATTING_3B 1 1 1 1
## TEAM_BATTING_HR 1 1 1 1
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX 1 1 1
## TARGET_WINS 1 1 1
## TEAM_BATTING_H 1 1 1
## TEAM_BATTING_2B 1 1 1
## TEAM_BATTING_3B 0 1 1
## TEAM_BATTING_HR 1 0 1
## TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX 1 1 1
## TARGET_WINS 1 1 1
## TEAM_BATTING_H 1 1 1
## TEAM_BATTING_2B 1 1 1
## TEAM_BATTING_3B 1 1 1
## TEAM_BATTING_HR 1 1 1
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## INDEX 1 1 1
## TARGET_WINS 1 1 1
## TEAM_BATTING_H 1 1 1
## TEAM_BATTING_2B 1 1 1
## TEAM_BATTING_3B 1 1 1
## TEAM_BATTING_HR 1 1 1
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX 1 1 1
## TARGET_WINS 1 1 1
## TEAM_BATTING_H 1 1 1
## TEAM_BATTING_2B 1 1 1
## TEAM_BATTING_3B 1 1 1
## TEAM_BATTING_HR 1 1 1
## TEAM_BATTING_1B TOTAL_BASES
## INDEX 1 1
## TARGET_WINS 1 1
## TEAM_BATTING_H 1 1
## TEAM_BATTING_2B 1 1
## TEAM_BATTING_3B 1 1
## TEAM_BATTING_HR 1 1
## Number of logged events: 2008
## it im dep meth
## 1 1 1 TEAM_BATTING_SO pmm
## 2 1 1 TEAM_BATTING_SO pmm
## 3 1 1 TEAM_BASERUN_SB pmm
## 4 1 1 TEAM_BASERUN_SB pmm
## 5 1 1 TEAM_BASERUN_CS pmm
## 6 1 1 TEAM_BASERUN_CS pmm
## out
## 1 TEAM_BATTING_H, TEAM_BATTING_BB
## 2 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
## 3 TEAM_BATTING_H
## 4 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
## 5 TEAM_BATTING_H, TEAM_PITCHING_BB, TEAM_PITCHING_SO, TEAM_FIELDING_E
## 6 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
# Take the second completed dataset out of the imputed validation object.
mb_valid_clean_imputed_2 <- complete(mb_valid_clean_imputed, action = 2)
# Disabled 80/20 split of the imputed training data, kept for reference:
# create a list of 80% of the rows in the original dataset we can use for training
#validation_index <- createDataPartition(mb_train_imp_2$TARGET_WINS, p=0.80, list=FALSE)
# select 20% of the data for validation
#mb_valid_imp <- mb_train_imp_2[-validation_index,]
# use the remaining 80% of the data for training and testing the models
#mb_train_imp <- mb_train_imp_2[validation_index,]
# Take the second completed dataset out of mb_train_init_imputed and preview it.
complete_data_2 <- complete(mb_train_init_imputed, action = 2)
head(x = complete_data_2)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
# Score against mb_valid_clean instead of mb_eval so predictions can be
# compared with known target wins.
# Keep only the fully observed rows of mb_valid_clean.
mb_valid_clean <- mb_valid_clean[complete.cases(mb_valid_clean), ]
# Predict wins for the imputed validation rows with model_clean_st3.
wins_pred <- predict(object = model_clean_st3, newdata = mb_valid_clean_imputed_2)
# wins_pred
# Collect and print any negative (impossible) win predictions.
wins_pred_neg <- subset(wins_pred, subset = wins_pred < 0)
wins_pred_neg
## 285
## -23.70446
# Root-mean-squared error of the predictions against the known wins.
# The call is namespace-qualified because the result is stored in a variable
# that shares the function's name.
rmse <- Metrics::rmse(
  actual = mb_valid_clean_imputed_2$TARGET_WINS,
  predicted = wins_pred
)
rmse
## [1] 15.87231
# Mean absolute percentage error of the predictions.
# MLmetrics::MAPE() is declared as MAPE(y_pred, y_true) and divides the
# absolute error by y_true, so the arguments are passed by name: predictions
# as y_pred, known wins as y_true. Passing the known wins first (as before)
# divides by the predictions instead of the actual values.
# Any value rendered before this change was computed with the two arguments
# swapped and must be regenerated.
# NOTE(review): a TARGET_WINS of 0 in the validation rows would make this
# Inf - confirm none are present.
mape <- MLmetrics::MAPE(
  y_pred = wins_pred,
  y_true = mb_valid_clean_imputed_2$TARGET_WINS
)
mape
## [1] 0.2325064
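RMSE: 15.09709 (recorded value; does not match the 15.87231 printed above, so re-check after re-running)
MAPE: 0.2541715 (recorded value; does not match the 0.2325064 printed above, so re-check after re-running)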
# Repeat the validation scoring with the model stored in stepmodel.
wins_pred <- predict(object = stepmodel, newdata = mb_valid_clean_imputed_2)
# wins_pred
# Root-mean-squared error of these predictions against the known wins.
rmse <- Metrics::rmse(
  actual = mb_valid_clean_imputed_2$TARGET_WINS,
  predicted = wins_pred
)
rmse
## [1] 16.25347
# Mean absolute percentage error of the stepmodel predictions.
# MLmetrics::MAPE() is declared as MAPE(y_pred, y_true) and divides the
# absolute error by y_true, so the arguments are passed by name: predictions
# as y_pred, known wins as y_true. Passing the known wins first (as before)
# divides by the predictions instead of the actual values.
# Any value rendered before this change was computed with the two arguments
# swapped and must be regenerated.
# NOTE(review): a TARGET_WINS of 0 in the validation rows would make this
# Inf - confirm none are present.
mape <- MLmetrics::MAPE(
  y_pred = wins_pred,
  y_true = mb_valid_clean_imputed_2$TARGET_WINS
)
mape
## [1] 0.2836372
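RMSE: 15.09709 (recorded value; does not match the 16.25347 printed above, so re-check after re-running)
MAPE: 0.2541715 (recorded value; does not match the 0.2836372 printed above, so re-check after re-running)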
# Approach taken from http://r-statistics.co/Linear-Regression.html
# Pair each known win total with its prediction, then print the correlation
# matrix of the two columns.
actual_preds <- data.frame(
  cbind(
    actuals = mb_valid_clean_imputed_2$TARGET_WINS,
    predicteds = wins_pred
  )
)
correlation_accuracy <- cor(x = actual_preds)
correlation_accuracy
## actuals predicteds
## actuals 1.0000000 0.3987365
## predicteds 0.3987365 1.0000000
# actual_preds
# STEP 2 TB: use TOTAL_BASES in place of the individual hit columns.
# TEAM_PITCHING_H has been removed from the predictors.
model_clean_st2tb <- lm(
  TARGET_WINS ~ TOTAL_BASES +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st2tb)
# Error rate: residual standard error relative to the mean response.
# na.omit makes lm() drop incomplete rows, so the mean is taken over the rows
# the model was actually fitted on rather than every row of mb_train_clean.
sigma(model_clean_st2tb) / mean(model.frame(model_clean_st2tb)$TARGET_WINS)
# STEP 3 TB: same total-bases model with TEAM_PITCHING_BB removed.
model_clean_st3tb <- lm(
  TARGET_WINS ~ TOTAL_BASES +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st3tb)
# Got worse
# Error rate: residual standard error relative to the mean response.
# na.omit makes lm() drop incomplete rows, so the mean is taken over the rows
# the model was actually fitted on rather than every row of mb_train_clean.
sigma(model_clean_st3tb) / mean(model.frame(model_clean_st3tb)$TARGET_WINS)
# STEP 3 - Got worse
# Individual-hits model with TEAM_BATTING_HR removed.
model_clean_st3 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)
summary(model_clean_st3)
# Error rate: residual standard error relative to the mean response.
# na.omit makes lm() drop incomplete rows, so the mean is taken over the rows
# the model was actually fitted on rather than every row of mb_train_clean.
sigma(model_clean_st3) / mean(model.frame(model_clean_st3)$TARGET_WINS)
# NOTE(review): this prints the fitted values of model_clean, not of
# model_clean_st3 fitted above - confirm that is intended.
fitted(model_clean)
# Reduced model on mb_train: triples, walks, stolen bases, hit-by-pitch,
# walks allowed, errors and double plays as predictors of TARGET_WINS.
model_2 <- lm(
  TARGET_WINS ~ TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_SB +
    TEAM_BATTING_HBP + TEAM_PITCHING_BB + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
summary(model_2)
# Error rate: residual standard error relative to the mean response.
# The mean is taken over the rows kept in the model frame, so it still matches
# the fitted sample if lm() dropped rows with missing predictors.
sigma(model_2) / mean(model.frame(model_2)$TARGET_WINS)
The fit got worse, as expected: at least some of the hitting statistics need to be included in the model.
Next, combine the individual hit types into a single total-bases predictor.
Source: http://www.philsbaseball.com/Articles/2010_to_2014/2014/September/total_base_percentage.php#:~:text=Here%20is%20the%20formula%3A%20Total,4)%20by%20at%2Dbats. The formula given there is: (Total Bases + walks + hit-by-pitches + stolen bases – caught stealing) divided by plate appearances.
# Total-bases model on mb_train: TOTAL_BASES replaces the individual hit
# columns; the remaining batting, baserunning, pitching and fielding
# predictors are kept.
model_TB <- lm(
  TARGET_WINS ~ TOTAL_BASES +
    TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train
)
summary(model_TB)
# Error rate: residual standard error relative to the mean response.
# The mean is taken over the rows kept in the model frame, so it still matches
# the fitted sample if lm() dropped rows with missing predictors.
sigma(model_TB) / mean(model.frame(model_TB)$TARGET_WINS)
# Derived predictor: walks added to total bases.
mb_train$TOTAL_BASES_PLUS <- mb_train$TEAM_BATTING_BB + mb_train$TOTAL_BASES
head(mb_train)
# Same as the total-bases model, with TOTAL_BASES and TEAM_BATTING_BB replaced
# by their sum, TOTAL_BASES_PLUS.
model_TB_plus <- lm(
  TARGET_WINS ~ TOTAL_BASES_PLUS +
    TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train
)
summary(model_TB_plus)
# Error rate: residual standard error relative to the mean response.
# The mean is taken over the rows kept in the model frame, so it still matches
# the fitted sample if lm() dropped rows with missing predictors.
sigma(model_TB_plus) / mean(model.frame(model_TB_plus)$TARGET_WINS)